@@ -14,6 +14,46 @@ module WebRequestConcern |
||
| 14 | 14 |
end |
| 15 | 15 |
end |
| 16 | 16 |
|
| 17 |
# Faraday response middleware that optionally gunzips a response body and
# then transcodes it, in place, to UTF-8.
#
# Options:
#   force_encoding   - encoding (name or Encoding) to assume for the body,
#                      regardless of what the Content-Type header says.
#   default_encoding - encoding to assume for textual content (text/*, XML,
#                      JSON) whose Content-Type carries no charset.  When
#                      nil, such a body is left untouched.
#   unzip            - set to 'gzip' to decompress the body before
#                      transcoding.
class CharacterEncoding < Faraday::Middleware
  # Extracts the charset parameter from a Content-Type header value.
  CHARSET_PARAMETER = /;\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i

  # Media types that are textual and therefore safe to transcode.
  TEXTUAL_CONTENT_TYPE = /\A\s*(?:text\/[^\s;]+|application\/(?:[^\s;]+\+)?(?:xml|json))\s*(?:;|\z)/i

  def initialize(app, force_encoding: nil, default_encoding: nil, unzip: nil)
    super(app)
    @force_encoding = force_encoding
    @default_encoding = default_encoding
    @unzip = unzip
  end

  # Runs the rest of the stack and post-processes the response body.
  # Always returns whatever the inner app returned.
  def call(env)
    @app.call(env).on_complete do |response_env|
      body = response_env[:body]
      # Nothing to decode for body-less responses.
      next unless body.is_a?(String)

      body.replace(ActiveSupport::Gzip.decompress(body)) if @unzip == 'gzip'

      next if body.encoding == Encoding::UTF_8

      source = source_encoding(body, response_env[:response_headers])
      # Use `next`, not `return`: `return` inside this block would exit
      # #call itself (handing nil to the caller) or raise LocalJumpError
      # if the callback fires after #call has already returned.
      next if source.nil?

      body.encode!(Encoding::UTF_8, source)
    end
  end

  private

  # Determines which encoding the body bytes are in.
  #
  # Returns nil when the body must be left as is: binary content, an
  # unknown charset name, or textual content with no charset and no
  # default encoding configured.
  def source_encoding(body, headers)
    return @force_encoding if @force_encoding
    # The adapter already tagged the body; trust that tag.
    return body.encoding unless body.encoding == Encoding::ASCII_8BIT

    # Not all Faraday adapters support automatic charset detection, so
    # we do that here.
    case headers && headers[:content_type]
    when CHARSET_PARAMETER
      begin
        Encoding.find(Regexp.last_match(1))
      rescue ArgumentError
        # Charset name unknown to Ruby; leave the body untouched.
        nil
      end
    when TEXTUAL_CONTENT_TYPE
      @default_encoding
    end
    # Any other content type is binary and is never transcoded.
  end
end

Faraday::Response.register_middleware character_encoding: CharacterEncoding
|
| 56 |
+ |
|
| 17 | 57 |
extend ActiveSupport::Concern |
| 18 | 58 |
|
| 19 | 59 |
def validate_web_request_options! |
@@ -34,6 +74,23 @@ module WebRequestConcern |
||
| 34 | 74 |
rescue ArgumentError => e |
| 35 | 75 |
errors.add(:base, e.message) |
| 36 | 76 |
end |
| 77 |
+ |
|
| 78 |
+ if (encoding = options['force_encoding']).present? |
|
| 79 |
+ case encoding |
|
| 80 |
+ when String |
|
| 81 |
+ begin |
|
| 82 |
+ Encoding.find(encoding) |
|
| 83 |
+ rescue ArgumentError |
|
| 84 |
+ errors.add(:base, "Unknown encoding: #{encoding.inspect}")
|
|
| 85 |
+ end |
|
| 86 |
+ else |
|
| 87 |
+ errors.add(:base, "force_encoding must be a string") |
|
| 88 |
+ end |
|
| 89 |
+ end |
|
| 90 |
+ end |
|
| 91 |
+ |
|
| 92 |
# Encoding handed to the character_encoding middleware as its
# default_encoding option, i.e. the encoding assumed for textual
# responses that do not declare a charset.
def default_encoding
  Encoding::UTF_8
end
| 38 | 95 |
|
| 39 | 96 |
def faraday |
@@ -44,6 +101,11 @@ module WebRequestConcern |
||
| 44 | 101 |
} |
| 45 | 102 |
|
| 46 | 103 |
@faraday ||= Faraday.new(faraday_options) { |builder|
|
| 104 |
+ builder.response :character_encoding, |
|
| 105 |
+ force_encoding: interpolated['force_encoding'].presence, |
|
| 106 |
+ default_encoding: default_encoding, |
|
| 107 |
+ unzip: interpolated['unzip'].presence |
|
| 108 |
+ |
|
| 47 | 109 |
builder.headers = headers if headers.length > 0 |
| 48 | 110 |
|
| 49 | 111 |
builder.headers[:user_agent] = user_agent |
@@ -51,7 +113,7 @@ module WebRequestConcern |
||
| 51 | 113 |
builder.use FaradayMiddleware::FollowRedirects |
| 52 | 114 |
builder.request :url_encoded |
| 53 | 115 |
|
| 54 |
- if boolify(options['disable_url_encoding']) |
|
| 116 |
+ if boolify(interpolated['disable_url_encoding']) |
|
| 55 | 117 |
builder.options.params_encoder = DoNotEncoder |
| 56 | 118 |
end |
| 57 | 119 |
|
@@ -29,6 +29,7 @@ module Agents |
||
| 29 | 29 |
* `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`. |
| 30 | 30 |
* `disable_ssl_verification` - Set to `true` to disable ssl verification. |
| 31 | 31 |
* `disable_url_encoding` - Set to `true` to disable url encoding. |
| 32 |
+ * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). |
|
| 32 | 33 |
* `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
|
| 33 | 34 |
* `max_events_per_run` - Limit number of events created (items parsed) per run for feed. |
| 34 | 35 |
|
@@ -87,7 +87,7 @@ module Agents |
||
| 87 | 87 |
|
| 88 | 88 |
Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
|
| 89 | 89 |
|
| 90 |
- Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset. |
|
| 90 |
+ Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). |
|
| 91 | 91 |
|
| 92 | 92 |
Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`).
|
| 93 | 93 |
|
@@ -157,19 +157,6 @@ module Agents |
||
| 157 | 157 |
errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back']) |
| 158 | 158 |
end |
| 159 | 159 |
|
| 160 |
- if (encoding = options['force_encoding']).present? |
|
| 161 |
- case encoding |
|
| 162 |
- when String |
|
| 163 |
- begin |
|
| 164 |
- Encoding.find(encoding) |
|
| 165 |
- rescue ArgumentError |
|
| 166 |
- errors.add(:base, "Unknown encoding: #{encoding.inspect}")
|
|
| 167 |
- end |
|
| 168 |
- else |
|
| 169 |
- errors.add(:base, "force_encoding must be a string") |
|
| 170 |
- end |
|
| 171 |
- end |
|
| 172 |
- |
|
| 173 | 160 |
validate_web_request_options! |
| 174 | 161 |
end |
| 175 | 162 |
|
@@ -284,12 +271,6 @@ module Agents |
||
| 284 | 271 |
interpolation_context.stack {
|
| 285 | 272 |
interpolation_context['_response_'] = ResponseDrop.new(response) |
| 286 | 273 |
body = response.body |
| 287 |
- if (encoding = interpolated['force_encoding']).present? |
|
| 288 |
- body = body.encode(Encoding::UTF_8, encoding) |
|
| 289 |
- end |
|
| 290 |
- if interpolated['unzip'] == "gzip" |
|
| 291 |
- body = ActiveSupport::Gzip.decompress(body) |
|
| 292 |
- end |
|
| 293 | 274 |
doc = parse(body) |
| 294 | 275 |
|
| 295 | 276 |
if extract_full_json? |